import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw dataset from the Excel workbook.
df = pd.read_excel('dataset.xlsx')
# Spell the option names out in full: pandas resolves abbreviated names by
# pattern matching, which raises an error once a prefix matches several options.
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)
# Show the frame (rendered only as the last expression of a notebook cell).
df
| Patient ID | Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Serum Glucose | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Mycoplasma pneumoniae | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Neutrophils | Urea | Proteina C reativa mg/dL | Creatinine | Potassium | Sodium | Influenza B, rapid test | Influenza A, rapid test | Alanine transaminase | Aspartate transaminase | Gamma-glutamyltransferase | Total Bilirubin | Direct Bilirubin | Indirect Bilirubin | Alkaline phosphatase | Ionized calcium | Strepto A | Magnesium | pCO2 (venous blood gas analysis) | Hb saturation (venous blood gas analysis) | Base excess (venous blood gas analysis) | pO2 (venous blood gas analysis) | Fio2 (venous blood gas analysis) | Total CO2 (venous blood gas analysis) | pH (venous blood gas analysis) | HCO3 (venous blood gas analysis) | Rods # | Segmented | Promyelocytes | Metamyelocytes | Myelocytes | Myeloblasts | Urine - Esterase | Urine - Aspect | Urine - pH | Urine - Hemoglobin | Urine - Bile pigments | Urine - Ketone Bodies | Urine - Nitrite | Urine - Density | Urine - Urobilinogen | Urine - Protein | Urine - Sugar | Urine - Leukocytes | Urine - Crystals | Urine - Red blood cells | Urine - Hyaline cylinders | Urine - Granular cylinders | Urine - Yeasts | Urine - Color | Partial thromboplastin 
time (PTT) | Relationship (Patient/Normal) | International normalized ratio (INR) | Lactic Dehydrogenase | Prothrombin time (PT), Activity | Vitamin B12 | Creatine phosphokinase (CPK) | Ferritin | Arterial Lactic Acid | Lipase dosage | D-Dimer | Albumin | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) | pO2 (arterial blood gas analysis) | Arteiral Fio2 | Phosphor | ctO2 (arterial blood gas analysis) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44477f75e8169d2 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 126e9dd13932f68 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | -0.140648 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | -0.619086 | 1.198059 | -0.147895 | 2.089928 | -0.305787 | 0.862512 | negative | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | a46b4402a0e5696 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | f7d619a94f97c45 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | d9e41465789c2b5 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5639 | ae66feb9e4dc3a0 | 3 | positive | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5640 | 517c2834024f3ea | 17 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5641 | 5c57d6037fe266d | 4 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5642 | c20c44766f28291 | 10 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | clear | 5 | absent | absent | absent | NaN | -0.338525 | normal | absent | NaN | 29000 | Ausentes | -0.177169 | absent | absent | absent | yellow | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5643 | 2697fdccbfeb7f7 | 19 | positive | 0 | 0 | 0 | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.380685 | 0.453725 | -0.503570 | -0.735872 | -0.552949 | -0.934388 | NaN | NaN | -0.28361 | 0.108761 | -0.420454 | -0.480996 | -0.586463 | -0.278654 | -0.243405 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.420204 | NaN | NaN | -0.343291 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5644 rows × 111 columns
# (number of rows, number of columns) of the loaded frame.
df.shape
(5644, 111)
# How many columns there are of each dtype.
df.dtypes.value_counts()
float64 70 object 37 int64 4 dtype: int64
# Heatmap of the NaN mask: one cell per (row, column) entry of the frame,
# drawn on a wide 20x10 canvas.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.isna(), ax=ax)
<AxesSubplot: >
# Fraction of missing values per column, smallest first.
df.isna().mean().sort_values()
Patient ID 0.000000 Patient age quantile 0.000000 SARS-Cov-2 exam result 0.000000 Patient addmited to regular ward (1=yes, 0=no) 0.000000 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.000000 Patient addmited to intensive care unit (1=yes, 0=no) 0.000000 Influenza B 0.760099 Respiratory Syncytial Virus 0.760099 Influenza A 0.760099 Rhinovirus/Enterovirus 0.760454 Inf A H1N1 2009 0.760454 CoronavirusOC43 0.760454 Coronavirus229E 0.760454 Parainfluenza 4 0.760454 Adenovirus 0.760454 Chlamydophila pneumoniae 0.760454 Parainfluenza 3 0.760454 Coronavirus HKU1 0.760454 CoronavirusNL63 0.760454 Parainfluenza 1 0.760454 Bordetella pertussis 0.760454 Parainfluenza 2 0.760454 Metapneumovirus 0.760454 Influenza A, rapid test 0.854713 Influenza B, rapid test 0.854713 Hemoglobin 0.893161 Hematocrit 0.893161 Red blood cell distribution width (RDW) 0.893338 Platelets 0.893338 Mean corpuscular volume (MCV) 0.893338 Eosinophils 0.893338 Mean corpuscular hemoglobin (MCH) 0.893338 Basophils 0.893338 Leukocytes 0.893338 Mean corpuscular hemoglobin concentration (MCHC) 0.893338 Lymphocytes 0.893338 Red blood Cells 0.893338 Monocytes 0.893515 Mean platelet volume 0.893870 Neutrophils 0.909107 Proteina C reativa mg/dL 0.910347 Creatinine 0.924876 Urea 0.929660 Potassium 0.934266 Sodium 0.934444 Strepto A 0.941176 Aspartate transaminase 0.959957 Alanine transaminase 0.960135 Serum Glucose 0.963147 Total Bilirubin 0.967753 Direct Bilirubin 0.967753 Indirect Bilirubin 0.967753 Gamma-glutamyltransferase 0.972892 Alkaline phosphatase 0.974486 HCO3 (venous blood gas analysis) 0.975904 pH (venous blood gas analysis) 0.975904 Total CO2 (venous blood gas analysis) 0.975904 Base excess (venous blood gas analysis) 0.975904 pO2 (venous blood gas analysis) 0.975904 pCO2 (venous blood gas analysis) 0.975904 Hb saturation (venous blood gas analysis) 0.975904 International normalized ratio (INR) 0.976435 Creatine phosphokinase (CPK) 0.981573 Lactic Dehydrogenase 0.982105 Myeloblasts 0.982814 
Myelocytes 0.982814 Metamyelocytes 0.982814 Promyelocytes 0.982814 Rods # 0.982814 Segmented 0.982814 Relationship (Patient/Normal) 0.983877 Urine - Crystals 0.987597 Urine - Color 0.987597 Urine - Yeasts 0.987597 Urine - Red blood cells 0.987597 Urine - Leukocytes 0.987597 Urine - Density 0.987597 Urine - Bile pigments 0.987597 Urine - Hemoglobin 0.987597 Urine - pH 0.987597 Urine - Aspect 0.987597 Urine - Urobilinogen 0.987775 Urine - Granular cylinders 0.987775 Urine - Hyaline cylinders 0.988129 Urine - Protein 0.989369 Urine - Esterase 0.989369 Urine - Ketone Bodies 0.989901 Ionized calcium 0.991141 Magnesium 0.992913 ctO2 (arterial blood gas analysis) 0.995216 Hb saturation (arterial blood gases) 0.995216 pH (arterial blood gas analysis) 0.995216 Arterial Lactic Acid 0.995216 Total CO2 (arterial blood gas analysis) 0.995216 pCO2 (arterial blood gas analysis) 0.995216 HCO3 (arterial blood gas analysis) 0.995216 pO2 (arterial blood gas analysis) 0.995216 Base excess (arterial blood gas analysis) 0.995216 Ferritin 0.995925 Arteiral Fio2 0.996456 Phosphor 0.996456 Albumin 0.997697 Lipase dosage 0.998583 Vitamin B12 0.999468 Urine - Nitrite 0.999823 Fio2 (venous blood gas analysis) 0.999823 Partial thromboplastin time (PTT) 1.000000 Urine - Sugar 1.000000 Mycoplasma pneumoniae 1.000000 D-Dimer 1.000000 Prothrombin time (PT), Activity 1.000000 dtype: float64
# Keep only the columns whose fraction of missing values is below 90%.
df = df.loc[:, df.isna().mean() < 0.9]
# Heatmap of the NaN mask of the frame after the sparse columns were removed.
sns.heatmap(df.isna())
<AxesSubplot: >
# Remove the 'Patient ID' column, then show the resulting frame.
df = df.drop(columns='Patient ID')
df
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5639 | 3 | positive | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5640 | 17 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5641 | 4 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5642 | 10 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5643 | 19 | positive | 0 | 0 | 0 | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5644 rows × 38 columns
# Proportion of each SARS-CoV-2 exam outcome (normalized value counts).
df['SARS-Cov-2 exam result'].value_counts(normalize=True)
negative 0.901134 positive 0.098866 Name: SARS-Cov-2 exam result, dtype: float64
# Print the name of every float-typed column, one per line.
# (Iterating a DataFrame yields its column labels.)
for col in df.select_dtypes('float'):
    print(col)
Hematocrit Hemoglobin Platelets Mean platelet volume Red blood Cells Lymphocytes Mean corpuscular hemoglobin concentration (MCHC) Leukocytes Basophils Mean corpuscular hemoglobin (MCH) Eosinophils Mean corpuscular volume (MCV) Monocytes Red blood cell distribution width (RDW)
# One histogram with a KDE overlay per float-typed column, each on its own figure.
for col in df.select_dtypes('float'):
    plt.figure()
    sns.histplot(df[col], kde=True)

# Open a fresh figure first: histplot draws on the current axes, so without it
# the age histogram would be painted over the last figure created by the loop
# whenever this code runs outside a dedicated notebook cell.
plt.figure()
sns.histplot(df['Patient age quantile'], bins=20, kde=True)
<AxesSubplot: xlabel='Patient age quantile', ylabel='Count'>
# Distinct values taken by the SARS-CoV-2 exam result column.
df['SARS-Cov-2 exam result'].unique()
array(['negative', 'positive'], dtype=object)
# For every object-typed column, print its name (left-aligned and padded with
# '-' to 40 characters) followed by the distinct values it contains.
for col in df.select_dtypes('object'):
    print(f'{col:-<40} {df[col].unique()}')
SARS-Cov-2 exam result------------------ ['negative' 'positive'] Respiratory Syncytial Virus------------- [nan 'not_detected' 'detected'] Influenza A----------------------------- [nan 'not_detected' 'detected'] Influenza B----------------------------- [nan 'not_detected' 'detected'] Parainfluenza 1------------------------- [nan 'not_detected' 'detected'] CoronavirusNL63------------------------- [nan 'not_detected' 'detected'] Rhinovirus/Enterovirus------------------ [nan 'detected' 'not_detected'] Coronavirus HKU1------------------------ [nan 'not_detected' 'detected'] Parainfluenza 3------------------------- [nan 'not_detected' 'detected'] Chlamydophila pneumoniae---------------- [nan 'not_detected' 'detected'] Adenovirus------------------------------ [nan 'not_detected' 'detected'] Parainfluenza 4------------------------- [nan 'not_detected' 'detected'] Coronavirus229E------------------------- [nan 'not_detected' 'detected'] CoronavirusOC43------------------------- [nan 'not_detected' 'detected'] Inf A H1N1 2009------------------------- [nan 'not_detected' 'detected'] Bordetella pertussis-------------------- [nan 'not_detected' 'detected'] Metapneumovirus------------------------- [nan 'not_detected' 'detected'] Parainfluenza 2------------------------- [nan 'not_detected'] Influenza B, rapid test----------------- [nan 'negative' 'positive'] Influenza A, rapid test----------------- [nan 'negative' 'positive']
# One pie chart per object-typed column showing the share of each value,
# labelled with percentages to one decimal place.
# value_counts() skips NaN, so shares are relative to the non-missing entries.
for col in df.select_dtypes('object'):
    plt.figure()
    df[col].value_counts().plot.pie(autopct='%1.1f%%')
# Split the rows by SARS-CoV-2 exam outcome.
positive_df = df.loc[df['SARS-Cov-2 exam result'].eq('positive')]
negative_df = df.loc[df['SARS-Cov-2 exam result'].eq('negative')]

# Fraction of missing values per column.
missing_rate = df.isna().mean()

# Columns whose missing rate lies strictly between 88% and 90%.
df.columns[(missing_rate > 0.88) & (missing_rate < 0.9)]
Index(['Hematocrit', 'Hemoglobin', 'Platelets', 'Mean platelet volume ',
'Red blood Cells', 'Lymphocytes',
'Mean corpuscular hemoglobin concentration (MCHC)', 'Leukocytes',
'Basophils', 'Mean corpuscular hemoglobin (MCH)', 'Eosinophils',
'Mean corpuscular volume (MCV)', 'Monocytes',
'Red blood cell distribution width (RDW)'],
dtype='object')
# Group the columns by their missing-value rate:
#   strictly between 88% and 90% -> blood_columns
#   strictly between 75% and 88% -> viral_columns
blood_columns = df.columns[(missing_rate < 0.9) & (missing_rate > 0.88)]
viral_columns = df.columns[(missing_rate < 0.88) & (missing_rate > 0.75)]

# Compare the distribution of each blood column between positive and negative
# patients on ONE shared axes.
# sns.displot is figure-level: it opens its own figure on every call, so pairing
# it with plt.figure() left an empty figure behind and put the two classes in
# separate figures. The axes-level sns.histplot draws both on the same axes.
# stat='density' normalises each class separately so the two groups, which
# differ in size, stay comparable.
for col in blood_columns:
    fig, ax = plt.subplots()
    sns.histplot(positive_df[col], label='positive', kde=True, stat='density', ax=ax)
    sns.histplot(negative_df[col], label='negative', kde=True, stat='density', ax=ax)
    ax.legend()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:447: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. fig = plt.figure(figsize=figsize)
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
# Number of patients per age quantile, with one bar per exam result.
sns.countplot(x='Patient age quantile', hue='SARS-Cov-2 exam result', data=df)
<AxesSubplot: xlabel='Patient age quantile', ylabel='count'>
# Contingency table: SARS-CoV-2 exam result (rows) against Influenza A (columns).
pd.crosstab(df['SARS-Cov-2 exam result'], df['Influenza A'])
| Influenza A | detected | not_detected |
|---|---|---|
| SARS-Cov-2 exam result | ||
| negative | 18 | 1224 |
| positive | 0 | 112 |
# For each viral column, draw an annotated heatmap of its contingency table
# against the SARS-CoV-2 exam result (integer counts, one figure per column).
for col in viral_columns:
    plt.figure()
    sns.heatmap(pd.crosstab(df['SARS-Cov-2 exam result'], df[col]), annot=True, fmt='d')
# Pairwise relationships between all blood columns.
sns.pairplot(df[blood_columns])
<seaborn.axisgrid.PairGrid at 0x2349deae280>
# Heatmap of the correlation matrix of the blood columns.
sns.heatmap(df[blood_columns].corr())
<AxesSubplot: >
# Same correlation matrix, shown as a hierarchically clustered heatmap.
sns.clustermap(df[blood_columns].corr())
<seaborn.matrix.ClusterGrid at 0x234ac128100>
# Regression plot of each blood column against the age quantile, with one fit
# per exam result.
# sns.lmplot is figure-level and opens its own figure on every call, so no
# plt.figure() is needed; calling it beforehand only left empty figures behind.
for col in blood_columns:
    sns.lmplot(x='Patient age quantile', y=col, hue='SARS-Cov-2 exam result', data=df)
C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\695644951.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. plt.figure()
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
# Correlation of every numeric column with the age quantile, sorted ascending.
# The numeric columns are selected explicitly: the frame still holds
# object-typed columns, which DataFrame.corr() rejects in pandas >= 2.0
# (older versions silently dropped them).
df.select_dtypes('number').corr()['Patient age quantile'].sort_values()
Leukocytes -0.166386 Platelets -0.158683 Lymphocytes -0.125935 Mean corpuscular hemoglobin concentration (MCHC) -0.124671 Red blood Cells -0.037510 Patient addmited to intensive care unit (1=yes, 0=no) -0.035772 Patient addmited to semi-intensive unit (1=yes, 0=no) 0.015736 Eosinophils 0.022085 Patient addmited to regular ward (1=yes, 0=no) 0.046166 Monocytes 0.050962 Hemoglobin 0.060320 Hematocrit 0.096808 Basophils 0.107525 Mean platelet volume 0.119449 Red blood cell distribution width (RDW) 0.166429 Mean corpuscular hemoglobin (MCH) 0.197394 Mean corpuscular volume (MCV) 0.281655 Patient age quantile 1.000000 Name: Patient age quantile, dtype: float64
# Contingency table: 'Influenza A' (rows) against 'Influenza A, rapid test' (columns).
pd.crosstab(df['Influenza A'], df['Influenza A, rapid test'])
| Influenza A, rapid test | negative | positive |
|---|---|---|
| Influenza A | ||
| detected | 2 | 4 |
| not_detected | 245 | 15 |
# Contingency table: 'Influenza B' (rows) against 'Influenza B, rapid test' (columns).
pd.crosstab(df['Influenza B'], df['Influenza B, rapid test'])
| Influenza B, rapid test | negative | positive |
|---|---|---|
| Influenza B | ||
| detected | 18 | 11 |
| not_detected | 233 | 4 |
# Per-row count of viral columns equal to "detected", plotted against the row index.
# The last two viral columns are excluded from the count.
(df[viral_columns[:-2]] == "detected").sum(axis=1).plot()
<AxesSubplot: >
# Boolean flag per row: at least one viral column (all but the last two) is "detected".
(df[viral_columns[:-2]] == "detected").sum(axis=1).ge(1)
0 False
1 True
2 False
3 False
4 True
...
5639 False
5640 False
5641 False
5642 False
5643 False
Length: 5644, dtype: bool
# Flag rows where at least one viral column reads "detected".
# The last two viral columns are left out, as in the original selection.
# NOTE(review): these appear to be the two rapid-test columns — confirm.
df['est malade'] = (df[viral_columns[:-2]] == "detected").any(axis=1)

# Split on the boolean flag directly instead of comparing to True / False.
malade_df = df[df['est malade']]
non_malade_df = df[~df['est malade']]

# Compare each blood column between flagged and unflagged rows on shared axes.
# sns.distplot is deprecated and scheduled for removal; the replacement is
# histplot with a KDE overlay and density normalisation.
for col in blood_columns:
    fig, ax = plt.subplots()
    sns.histplot(malade_df[col], label='malade', kde=True, stat='density', ax=ax)
    sns.histplot(non_malade_df[col], label='non malade', kde=True, stat='density', ax=ax)
    ax.legend()
C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(malade_df[col], label='malade') C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2167010223.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(non_malade_df[col], label='non malade')
def hospitalisation(df):
    """Return the hospitalisation status label for one patient row.

    Despite its name, ``df`` is a single row: any object indexable by
    column name, such as the Series handed over by
    ``DataFrame.apply(..., axis=1)``.

    The admission flags are tested in a fixed order and the first one
    equal to 1 decides the label; when none is equal to 1 the status is
    'inconnu'.
    """
    # (column, label) pairs, listed in the order they must be checked.
    statut_par_colonne = (
        ('Patient addmited to regular ward (1=yes, 0=no)', 'surveillance'),
        ('Patient addmited to semi-intensive unit (1=yes, 0=no)', 'soins semi-intensives'),
        ('Patient addmited to intensive care unit (1=yes, 0=no)', 'soins intensifs'),
    )
    for colonne, statut in statut_par_colonne:
        if df[colonne] == 1:
            return statut
    return 'inconnu'
# Add a 'statut' column by running hospitalisation() on every row.
df['statut'] = df.apply(func=hospitalisation, axis='columns')
# Show the first rows so the new column can be checked.
df.head()
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | est malade | statut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative | True | inconnu |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | True | inconnu |
# For each blood variable, overlay its distribution for every hospitalisation
# status on a dedicated figure.
#
# `sns.distplot` is deprecated and announced for removal in seaborn v0.14.0;
# `sns.histplot(..., kde=True, stat='density')` is the axes-level replacement
# that draws the same normalised histogram + KDE curve.
statuts = df['statut'].unique()  # loop-invariant: computed once, not per column
for col in blood_columns:
    plt.figure()
    for cat in statuts:
        sns.histplot(
            # Single .loc selection instead of chained df[mask][col].
            df.loc[df['statut'] == cat, col],
            kde=True,
            stat='density',      # normalise so groups of different size compare
            kde_kws={'cut': 3},  # extend the KDE past the data like distplot did
            alpha=0.4,           # keep overlapping histograms readable
            label=cat,
        )
    plt.legend()
C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat) C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\4182288631.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df[df['statut']==cat][col], label=cat)
blood_columns
Index(['Hematocrit', 'Hemoglobin', 'Platelets', 'Mean platelet volume ',
'Red blood Cells', 'Lymphocytes',
'Mean corpuscular hemoglobin concentration (MCHC)', 'Leukocytes',
'Basophils', 'Mean corpuscular hemoglobin (MCH)', 'Eosinophils',
'Mean corpuscular volume (MCV)', 'Monocytes',
'Red blood cell distribution width (RDW)'],
dtype='object')
viral_columns
Index(['Respiratory Syncytial Virus', 'Influenza A', 'Influenza B',
'Parainfluenza 1', 'CoronavirusNL63', 'Rhinovirus/Enterovirus',
'Coronavirus HKU1', 'Parainfluenza 3', 'Chlamydophila pneumoniae',
'Adenovirus', 'Parainfluenza 4', 'Coronavirus229E', 'CoronavirusOC43',
'Inf A H1N1 2009', 'Bordetella pertussis', 'Metapneumovirus',
'Parainfluenza 2', 'Influenza B, rapid test',
'Influenza A, rapid test'],
dtype='object')
sns.heatmap(df.isna())
<AxesSubplot: >
df.dropna().count()
Patient age quantile 99 SARS-Cov-2 exam result 99 Patient addmited to regular ward (1=yes, 0=no) 99 Patient addmited to semi-intensive unit (1=yes, 0=no) 99 Patient addmited to intensive care unit (1=yes, 0=no) 99 Hematocrit 99 Hemoglobin 99 Platelets 99 Mean platelet volume 99 Red blood Cells 99 Lymphocytes 99 Mean corpuscular hemoglobin concentration (MCHC) 99 Leukocytes 99 Basophils 99 Mean corpuscular hemoglobin (MCH) 99 Eosinophils 99 Mean corpuscular volume (MCV) 99 Monocytes 99 Red blood cell distribution width (RDW) 99 Respiratory Syncytial Virus 99 Influenza A 99 Influenza B 99 Parainfluenza 1 99 CoronavirusNL63 99 Rhinovirus/Enterovirus 99 Coronavirus HKU1 99 Parainfluenza 3 99 Chlamydophila pneumoniae 99 Adenovirus 99 Parainfluenza 4 99 Coronavirus229E 99 CoronavirusOC43 99 Inf A H1N1 2009 99 Bordetella pertussis 99 Metapneumovirus 99 Parainfluenza 2 99 Influenza B, rapid test 99 Influenza A, rapid test 99 est malade 99 statut 99 dtype: int64
df[blood_columns].count()
Hematocrit 603 Hemoglobin 603 Platelets 602 Mean platelet volume 599 Red blood Cells 602 Lymphocytes 602 Mean corpuscular hemoglobin concentration (MCHC) 602 Leukocytes 602 Basophils 602 Mean corpuscular hemoglobin (MCH) 602 Eosinophils 602 Mean corpuscular volume (MCV) 602 Monocytes 601 Red blood cell distribution width (RDW) 602 dtype: int64
df[viral_columns].count()
Respiratory Syncytial Virus 1354 Influenza A 1354 Influenza B 1354 Parainfluenza 1 1352 CoronavirusNL63 1352 Rhinovirus/Enterovirus 1352 Coronavirus HKU1 1352 Parainfluenza 3 1352 Chlamydophila pneumoniae 1352 Adenovirus 1352 Parainfluenza 4 1352 Coronavirus229E 1352 CoronavirusOC43 1352 Inf A H1N1 2009 1352 Bordetella pertussis 1352 Metapneumovirus 1352 Parainfluenza 2 1352 Influenza B, rapid test 820 Influenza A, rapid test 820 dtype: int64
# Keep the panel-based viral tests only: the last two entries of
# `viral_columns` are the rapid influenza tests, which were run on fewer
# patients. `assign` builds a new frame, so nothing is written into a slice
# of `df` (the cause of the SettingWithCopyWarning).
df1 = df[viral_columns[:-2]].assign(covid=df['SARS-Cov-2 exam result'])
# Share of negative/positive COVID results among rows with every viral test filled in.
df1.dropna()['covid'].value_counts(normalize=True)
C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\2316125735.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df1['covid']= df['SARS-Cov-2 exam result']
negative 0.91716 positive 0.08284 Name: covid, dtype: float64
# Blood-test subset plus the COVID result. `assign` builds a new frame, so
# nothing is written into a slice of `df` (the cause of the
# SettingWithCopyWarning).
# NOTE(review): `[:-2]` leaves out the last two blood columns (Monocytes and
# RDW in the listing of `blood_columns`); this looks copied from the viral
# cell, where it removes the rapid tests — confirm it is intended here.
df2 = df[blood_columns[:-2]].assign(covid=df['SARS-Cov-2 exam result'])
# Share of negative/positive COVID results among rows with these blood values filled in.
df2.dropna()['covid'].value_counts(normalize=True)
C:\Users\ILIAS SABANI\AppData\Local\Temp\ipykernel_9916\3602602215.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df2['covid']= df['SARS-Cov-2 exam result']
negative 0.864775 positive 0.135225 Name: covid, dtype: float64
from scipy.stats import ttest_ind
positive_df.shape
(558, 38)
negative_df.shape
(5086, 38)
# Down-sample the negative group to the number of rows in the positive group
# so both samples handed to the t-test have the same size. The fixed seed
# keeps the draw, and therefore the test verdicts, identical between runs.
balanced_neg = negative_df.sample(positive_df.shape[0], random_state=0)
def t_test(col, alpha=0.02):
    """Compare one column between the balanced negative and the positive group.

    Runs ``scipy.stats.ttest_ind`` on the non-missing values of ``col`` taken
    from the module-level frames ``balanced_neg`` and ``positive_df``.

    Parameters
    ----------
    col : label of a column present in both frames.
    alpha : significance level; defaults to 0.02, the value that used to be
        hard-coded in the body.

    Returns
    -------
    The string 'HO Rejetée' when the p-value is below ``alpha``, otherwise
    the integer ``0``.
    """
    negatives = balanced_neg[col].dropna()
    positives = positive_df[col].dropna()
    # Only the p-value drives the decision; the statistic is discarded.
    _, p = ttest_ind(negatives, positives)
    if p < alpha:
        return 'HO Rejetée'
    return 0
# One line per blood marker: the column name left-aligned and padded with
# dashes to 50 characters, followed by the verdict returned by t_test.
for col in blood_columns:
    print(f'{col :-<50} {t_test(col)}')
Hematocrit---------------------------------------- 0 Hemoglobin---------------------------------------- 0 Platelets----------------------------------------- HO Rejetée Mean platelet volume ----------------------------- 0 Red blood Cells----------------------------------- 0 Lymphocytes--------------------------------------- 0 Mean corpuscular hemoglobin concentration (MCHC)-- 0 Leukocytes---------------------------------------- HO Rejetée Basophils----------------------------------------- HO Rejetée Mean corpuscular hemoglobin (MCH)----------------- 0 Eosinophils--------------------------------------- HO Rejetée Mean corpuscular volume (MCV)--------------------- 0 Monocytes----------------------------------------- HO Rejetée Red blood cell distribution width (RDW)----------- 0
# Work on an independent copy so later column selection does not alter `df`.
data = df.copy(deep=True)
# Preview the first five rows.
data.head(5)
| Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | Influenza B, rapid test | Influenza A, rapid test | est malade | statut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 1 | 17 | negative | 0 | 0 | 0 | 0.236515 | -0.02234 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.95079 | -0.09461 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | negative | negative | True | inconnu |
| 2 | 8 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 3 | 5 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | inconnu |
| 4 | 15 | negative | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | NaN | NaN | True | inconnu |
# Fraction of missing values in each column.
missing_rate = data.isna().sum() / len(data)

# Columns are grouped by how often they are missing: the blood measurements
# fall strictly between 88 % and 90 %, the viral panel strictly between
# 75 % and 80 %.
is_blood = (missing_rate > 0.88) & (missing_rate < 0.9)
is_viral = (missing_rate > 0.75) & (missing_rate < 0.8)
blood_columns = data.columns[is_blood].tolist()
viral_columns = data.columns[is_viral].tolist()

# Keep only the age, the target and the two groups selected above.
key_columns = ['Patient age quantile', 'SARS-Cov-2 exam result']
data = data[[*key_columns, *blood_columns, *viral_columns]]
data
| Patient age quantile | SARS-Cov-2 exam result | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | Red blood Cells | Lymphocytes | Mean corpuscular hemoglobin concentration (MCHC) | Leukocytes | Basophils | Mean corpuscular hemoglobin (MCH) | Eosinophils | Mean corpuscular volume (MCV) | Monocytes | Red blood cell distribution width (RDW) | Respiratory Syncytial Virus | Influenza A | Influenza B | Parainfluenza 1 | CoronavirusNL63 | Rhinovirus/Enterovirus | Coronavirus HKU1 | Parainfluenza 3 | Chlamydophila pneumoniae | Adenovirus | Parainfluenza 4 | Coronavirus229E | CoronavirusOC43 | Inf A H1N1 2009 | Bordetella pertussis | Metapneumovirus | Parainfluenza 2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 17 | negative | 0.236515 | -0.022340 | -0.517413 | 0.010677 | 0.102004 | 0.318366 | -0.950790 | -0.094610 | -0.223767 | -0.292269 | 1.482158 | 0.166192 | 0.357547 | -0.625073 | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected |
| 2 | 8 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 5 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 15 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | not_detected | not_detected | not_detected | not_detected | not_detected | detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected | not_detected |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5639 | 3 | positive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5640 | 17 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5641 | 4 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5642 | 10 | negative | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5643 | 19 | positive | 0.694287 | 0.541564 | -0.906829 | -0.325903 | 0.578024 | -0.295726 | -0.353319 | -1.288428 | -1.140144 | -0.135455 | -0.835508 | 0.025985 | 0.567652 | -0.182790 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5644 rows × 33 columns
from sklearn.model_selection import *
trainset, testset = train_test_split(data, test_size=0.2, random_state=0)
trainset['SARS-Cov-2 exam result'].value_counts()
negative 4068 positive 447 Name: SARS-Cov-2 exam result, dtype: int64
testset['SARS-Cov-2 exam result'].value_counts()
negative 1018 positive 111 Name: SARS-Cov-2 exam result, dtype: int64
def encodage(data):
    """Return a copy of ``data`` with its text columns encoded as numbers.

    Every column of dtype ``object`` is mapped with
    'positive' / 'detected' -> 1 and 'negative' / 'not_detected' -> 0.
    Values absent from that table (missing values included) become NaN.
    Columns of any other dtype are returned unchanged.

    The input frame is not modified: the work is done on a copy, so calling
    this on a slice of another frame neither writes back into it nor raises
    a SettingWithCopyWarning.
    """
    code = {'positive': 1,
            'negative': 0,
            'detected': 1,
            'not_detected': 0}
    encoded = data.copy()
    for col in encoded.select_dtypes('object').columns:
        # Plain column assignment swaps the whole column, so the result takes
        # the numeric dtype produced by `map` instead of staying `object`
        # as an in-place `.loc[:, col] = ...` write can leave it.
        encoded[col] = encoded[col].map(code)
    return encoded
def feature_engineering(data, viral_cols=None):
    """Collapse the viral test columns into a single 'est malade' flag.

    'est malade' is True for a row when the sum of its viral test columns is
    at least 1; missing values are skipped by the sum. The viral columns are
    then removed and the flag is appended as the last column.

    Parameters
    ----------
    data : frame whose viral columns are already numeric.
    viral_cols : labels of the viral test columns; when omitted, the
        module-level ``viral_columns`` is used.

    Returns
    -------
    A new DataFrame. ``data`` is left untouched, so no column is added to the
    caller's frame as a side effect.
    """
    if viral_cols is None:
        viral_cols = viral_columns
    viral_cols = list(viral_cols)
    is_sick = data[viral_cols].sum(axis=1) >= 1
    # The column name contains a space, hence the dict form for `assign`.
    return data.drop(columns=viral_cols).assign(**{'est malade': is_sick})
def imputation(data):
    """Discard every row that contains at least one missing value.

    Returns a new DataFrame holding only the complete rows; ``data`` itself
    is not changed.
    """
    # Not applied here: adding an 'is na' indicator column or filling the
    # gaps with -999. Incomplete rows are simply removed.
    complete_rows = data.dropna(axis='index', how='any')
    return complete_rows
def preprocessing(data):
    """Run the preparation chain and separate the features from the target.

    Applies ``encodage``, ``feature_engineering`` and ``imputation`` in that
    order, prints the class counts of the target, and returns ``(X, y)``:
    ``y`` is the 'SARS-Cov-2 exam result' column and ``X`` is every other
    column of the prepared frame.
    """
    target = 'SARS-Cov-2 exam result'
    prepared = imputation(feature_engineering(encodage(data)))
    y = prepared[target]
    X = prepared.drop(columns=target)
    # Show how many samples of each class are left after the row filtering.
    print(y.value_counts())
    return X, y
X_train, y_train = preprocessing(trainset)
0 422 1 65 Name: SARS-Cov-2 exam result, dtype: int64
X_test, y_test = preprocessing(testset)
0 95 1 16 Name: SARS-Cov-2 exam result, dtype: int64
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.pipeline import *
from sklearn.feature_selection import *
from sklearn.preprocessing import *
from sklearn.decomposition import *
from sklearn.svm import *
from sklearn.neighbors import *
def _make_preprocessor():
    """Build a fresh feature step: degree-2 polynomial terms, then the 10 best by f_classif."""
    return make_pipeline(PolynomialFeatures(2), SelectKBest(f_classif, k=10))


# Kept under its original name for any cell that refers to it directly.
preprocessor = _make_preprocessor()

# Every model receives its own preprocessor instance. Passing one shared
# object to all four pipelines would make them share fitted state and
# parameters, so fitting or tuning one model would silently change the others.
RandomForest = make_pipeline(_make_preprocessor(), RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(_make_preprocessor(), AdaBoostClassifier(random_state=0))
# The distance/margin based models also get a scaling step after the selection.
SVM = make_pipeline(_make_preprocessor(), StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(_make_preprocessor(), StandardScaler(), KNeighborsClassifier())

# Display name -> pipeline, iterated by the evaluation loop.
list_of_models = {
    'RandomForest': RandomForest,
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'KNN': KNN,
}
# Fit and report each candidate in turn. The loop variables stay at module
# level, so `model` refers to the last pipeline once the loop has finished.
for name, model in list_of_models.items():
    print(name)
    evaluation(model)
RandomForest
[[91 4]
[11 5]]
precision recall f1-score support
0 0.89 0.96 0.92 95
1 0.56 0.31 0.40 16
accuracy 0.86 111
macro avg 0.72 0.64 0.66 111
weighted avg 0.84 0.86 0.85 111
AdaBoost
[[91 4]
[ 9 7]]
precision recall f1-score support
0 0.91 0.96 0.93 95
1 0.64 0.44 0.52 16
accuracy 0.88 111
macro avg 0.77 0.70 0.73 111
weighted avg 0.87 0.88 0.87 111
SVM
[[92 3]
[10 6]]
precision recall f1-score support
0 0.90 0.97 0.93 95
1 0.67 0.38 0.48 16
accuracy 0.88 111
macro avg 0.78 0.67 0.71 111
weighted avg 0.87 0.88 0.87 111
KNN
[[88 7]
[ 8 8]]
precision recall f1-score support
0 0.92 0.93 0.92 95
1 0.53 0.50 0.52 16
accuracy 0.86 111
macro avg 0.72 0.71 0.72 111
weighted avg 0.86 0.86 0.86 111
from sklearn.metrics import *
from sklearn.model_selection import *
def evaluation(model):
    """Fit ``model``, print its test-set report and plot its learning curve.

    Relies on the module-level ``X_train``/``y_train`` (fitting and learning
    curve) and ``X_test``/``y_test`` (confusion matrix and classification
    report). The learning curve uses 4-fold cross-validation scored with F1
    and is drawn on a new 12x8 figure. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

    # Ten training-set fractions from 10 % to 100 %.
    fractions = np.linspace(0.1, 1, 10)
    n_samples, train_scores, val_scores = learning_curve(
        model, X_train, y_train, cv=4, scoring='f1', train_sizes=fractions
    )

    plt.figure(figsize=(12, 8))
    # Averaging along axis 1 leaves one score per training-set size.
    plt.plot(n_samples, train_scores.mean(axis=1), label='train score')
    plt.plot(n_samples, val_scores.mean(axis=1), label='validation score')
    plt.legend()
evaluation(model)
[[91 4]
[11 5]]
precision recall f1-score support
0 0.89 0.96 0.92 95
1 0.56 0.31 0.40 16
accuracy 0.86 111
macro avg 0.72 0.64 0.66 111
weighted avg 0.84 0.86 0.85 111
# Bar chart of the fitted model's feature importances, labelled with the
# training columns.
# NOTE(review): this only works if `model` is a bare estimator exposing
# `feature_importances_` with one value per column of X_train; a pipeline
# such as those in `list_of_models` would not satisfy that — confirm which
# object `model` refers to when this cell runs.
pd.DataFrame(model.feature_importances_, index = X_train.columns).plot.bar()
<AxesSubplot: >
from sklearn.model_selection import *
SVM
Pipeline(steps=[('pipeline',
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
('selectkbest', SelectKBest())])),
('standardscaler', StandardScaler()),
('svc', SVC(random_state=0))])
# Search space for the SVM pipeline. Keys use `step__parameter` paths; the
# `pipeline__` prefix addresses the nested preprocessing pipeline.
hyper_params = {
    'svc__gamma': [1e-3, 1e-4],
    'svc__C': [1, 10, 100, 1000],
    'pipeline__polynomialfeatures__degree': [2, 3, 4, 5],
    'pipeline__selectkbest__k': range(40, 100),
}

# Try 50 random combinations, scored on recall with 4-fold cross-validation.
# random_state pins which combinations are drawn, so the reported
# best_params_ and scores are the same on every run.
grid = RandomizedSearchCV(SVM, hyper_params, scoring='recall', cv=4, n_iter=50,
                          random_state=0)
grid.fit(X_train, y_train)
print(grid.best_params_)

# Report the best configuration on the held-out test set.
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))
{'svc__gamma': 0.001, 'svc__C': 1000, 'pipeline__selectkbest__k': 56, 'pipeline__polynomialfeatures__degree': 3}
precision recall f1-score support
0 0.92 0.93 0.92 95
1 0.53 0.50 0.52 16
accuracy 0.86 111
macro avg 0.72 0.71 0.72 111
weighted avg 0.86 0.86 0.86 111
evaluation(grid.best_estimator_)
[[88 7]
[ 8 8]]
precision recall f1-score support
0 0.92 0.93 0.92 95
1 0.53 0.50 0.52 16
accuracy 0.86 111
macro avg 0.72 0.71 0.72 111
weighted avg 0.86 0.86 0.86 111
from sklearn.metrics import *
# Decision scores of the tuned pipeline on the test set.
decision_scores = grid.best_estimator_.decision_function(X_test)
precision, recall, threshold = precision_recall_curve(y_test, decision_scores)

# precision and recall carry one more entry than threshold, hence the [:-1]
# so that each curve lines up with the thresholds on the x axis.
plt.plot(threshold, precision[:-1], label='precision')
plt.plot(threshold, recall[:-1], label='recall')
plt.legend()
<matplotlib.legend.Legend at 0x234b59aaaf0>
def model_final(model, X, threshold=0):
    """Classify ``X`` by comparing the model's decision scores to ``threshold``.

    Calls ``model.decision_function(X)`` and returns the element-wise result
    of ``score > threshold`` (strictly greater), so a score equal to the
    threshold is not flagged.
    """
    scores = model.decision_function(X)
    return scores > threshold
# Predictions of the tuned pipeline at a chosen decision threshold, followed
# by the F1 score of those predictions on the test set.
decision_threshold = 0
y_pred = model_final(grid.best_estimator_, X_test, threshold=decision_threshold)
f1_score(y_test, y_pred)
0.14285714285714285
recall_score(y_test, y_pred)
0.5